DATA1901

Executive Summary

Initial Data Analysis

Source, structure, reclassification, limitation of data collection, assumptions, define meaning of key variables, describe any assumption (idea about the data), data cleaning.

Code
library(readxl)
sheets = excel_sheets("all_data.xlsx")
print(sheets)
[1] "data_dictionary" "experiment1"     "experiment2"     "experiment3"    
[5] "pooled123"      
Code
all_data = read_excel("all_data.xlsx", sheet = sheets[5])
head(all_data)
# A tibble: 6 × 82
  Date                  PID BSSQ_1 BSSQ_2 BSSQ_3 BSSQ_4 BSSQ_5 BSSQ_6 BSSQ_7
  <dttm>              <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
1 2021-06-30 16:55:00  1201      0      0      0      0      0      0      0
2 2021-06-30 17:46:00  1351      4      2      0      1      4      0      0
3 2021-06-30 18:25:00  2351      1      7      0      0      2      0      0
4 2021-06-30 18:59:00  1151      0      0      0      0      0      0      0
5 2021-06-30 20:25:00  2201      0      0      0      0      0      0      0
6 2021-06-30 21:05:00  1251      2      5      1      0      4      0      0
# ℹ 73 more variables: BSSQ_8 <dbl>, BSSQ_9 <dbl>, BSSQ_10 <dbl>,
#   BSSQ_11 <dbl>, BSSQ_12 <dbl>, BSSQ_13 <dbl>, BSSQ_14 <dbl>, BSSQ_15 <dbl>,
#   BSSQ_16 <dbl>, BSTAI_1 <chr>, BSTAI_2 <chr>, BSTAI_3 <chr>, BSTAI_4 <chr>,
#   BSTAI_5 <chr>, BSTAI_6 <chr>, BE <dbl>, BVRA <dbl>, choice_env <chr>,
#   no_choice_env <chr>, PSTAI_1 <chr>, PSTAI_2 <chr>, PSTAI_3 <chr>,
#   PSTAI_4 <chr>, PSTAI_5 <chr>, PSTAI_6 <chr>, PE <dbl>, PVRA <dbl>,
#   ASSQ_1 <dbl>, ASSQ_2 <dbl>, ASSQ_3 <dbl>, ASSQ_4 <dbl>, ASSQ_5 <dbl>, …
Code
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
Code
data = all_data %>% select(age, gender, env, choice, BSSQ_5, ASSQ_5)
head(data)
# A tibble: 6 × 6
    age gender env   choice    BSSQ_5 ASSQ_5
  <dbl> <chr>  <chr> <chr>      <dbl>  <dbl>
1    30 Male   Snowy CHOICE         0      2
2    37 Female Snowy CHOICE         4      3
3    28 Female Snowy NO_CHOICE      2      1
4    39 Female Snowy CHOICE         0      0
5    37 Male   Snowy NO_CHOICE      0      0
6    18 Female Snowy CHOICE         4      0
Code
mutated_data = mutate(data, focus_lvl_diff = ASSQ_5 - BSSQ_5)
head(mutated_data)
# A tibble: 6 × 7
    age gender env   choice    BSSQ_5 ASSQ_5 focus_lvl_diff
  <dbl> <chr>  <chr> <chr>      <dbl>  <dbl>          <dbl>
1    30 Male   Snowy CHOICE         0      2              2
2    37 Female Snowy CHOICE         4      3             -1
3    28 Female Snowy NO_CHOICE      2      1             -1
4    39 Female Snowy CHOICE         0      0              0
5    37 Male   Snowy NO_CHOICE      0      0              0
6    18 Female Snowy CHOICE         4      0             -4
Code
filtered_data <- filter(mutated_data, gender != 'Other')
head(filtered_data)
# A tibble: 6 × 7
    age gender env   choice    BSSQ_5 ASSQ_5 focus_lvl_diff
  <dbl> <chr>  <chr> <chr>      <dbl>  <dbl>          <dbl>
1    30 Male   Snowy CHOICE         0      2              2
2    37 Female Snowy CHOICE         4      3             -1
3    28 Female Snowy NO_CHOICE      2      1             -1
4    39 Female Snowy CHOICE         0      0              0
5    37 Male   Snowy NO_CHOICE      0      0              0
6    18 Female Snowy CHOICE         4      0             -4

cleaning: data row yang gender other remove

Research Questions

Research Question 1

Code
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ readr     2.1.5
✔ ggplot2   3.5.1     ✔ stringr   1.5.1
✔ lubridate 1.9.4     ✔ tibble    3.2.1
✔ purrr     1.0.4     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Code
ggplot(filtered_data, aes(x=age, y=focus_lvl_diff)) +
  geom_point() + 
  geom_smooth(method = "lm", se = FALSE) + 
  labs(x="Age",
       y="Difference in Focus Level",
       title="Age vs ∆ Focus") 
`geom_smooth()` using formula = 'y ~ x'

Code
library(ggplot2)
library(plotly)

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
Code
p = ggplot(filtered_data, aes(x=age, y=focus_lvl_diff, colour = gender)) +
  geom_point() + 
  geom_smooth(method = "lm", se = FALSE) + 
  labs(x="Age",
       y="Difference in Focus Level",
       title="Age vs ∆ Focus") 

ggplotly(p)
`geom_smooth()` using formula = 'y ~ x'
Code
p = ggplot(mutated_data, aes(x=age, y=focus_lvl_diff, colour = env)) +
  geom_point() + 
  geom_smooth(method = "lm", se = FALSE) + 
  labs(x="Age",
       y="Difference in Focus Level",
       title="Age vs ∆ Focus") 

ggplotly(p)
`geom_smooth()` using formula = 'y ~ x'
Code
ggplot(mutated_data, aes(x=age, y=focus_lvl_diff, colour = choice)) +
  geom_point() + 
  geom_smooth(method = "lm", se = FALSE) + 
  labs(x="Age",
       y="Difference in Focus Level",
       title="Age vs ∆ Focus") 
`geom_smooth()` using formula = 'y ~ x'

Code
library(ggplot2)

# Fitting the model
model = lm(focus_lvl_diff ~ age, data = mutated_data)

# Create a residual plot
ggplot(model, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "red") +
  labs(title = "Residuals vs. Fitted Values")

Code
filtered_snowy = filter(mutated_data, env == 'Snowy')
filtered_sunny = filter(mutated_data, env == 'Sunny')
Code
model_snowy = lm(focus_lvl_diff ~ age, data = filtered_snowy)

# Create a residual plot
ggplot(model_snowy, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "red") +
  labs(title = "Residuals vs. Fitted Values")

Code
model_sunny = lm(focus_lvl_diff ~ age, data = filtered_sunny)

# Create a residual plot
ggplot(model_sunny, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "red") +
  labs(title = "Residuals vs. Fitted Values")

how to plot the residual for both models? should we make two different tables?

Research Question 2

Articles

2 articles max 50 words

Acknowledgement

References

Code
library(readxl)
sheets = excel_sheets("all_data.xlsx")
print(sheets)
[1] "data_dictionary" "experiment1"     "experiment2"     "experiment3"    
[5] "pooled123"      
Code
all_data = read_excel("all_data.xlsx", sheet = sheets[5])
head(all_data)
# A tibble: 6 × 82
  Date                  PID BSSQ_1 BSSQ_2 BSSQ_3 BSSQ_4 BSSQ_5 BSSQ_6 BSSQ_7
  <dttm>              <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
1 2021-06-30 16:55:00  1201      0      0      0      0      0      0      0
2 2021-06-30 17:46:00  1351      4      2      0      1      4      0      0
3 2021-06-30 18:25:00  2351      1      7      0      0      2      0      0
4 2021-06-30 18:59:00  1151      0      0      0      0      0      0      0
5 2021-06-30 20:25:00  2201      0      0      0      0      0      0      0
6 2021-06-30 21:05:00  1251      2      5      1      0      4      0      0
# ℹ 73 more variables: BSSQ_8 <dbl>, BSSQ_9 <dbl>, BSSQ_10 <dbl>,
#   BSSQ_11 <dbl>, BSSQ_12 <dbl>, BSSQ_13 <dbl>, BSSQ_14 <dbl>, BSSQ_15 <dbl>,
#   BSSQ_16 <dbl>, BSTAI_1 <chr>, BSTAI_2 <chr>, BSTAI_3 <chr>, BSTAI_4 <chr>,
#   BSTAI_5 <chr>, BSTAI_6 <chr>, BE <dbl>, BVRA <dbl>, choice_env <chr>,
#   no_choice_env <chr>, PSTAI_1 <chr>, PSTAI_2 <chr>, PSTAI_3 <chr>,
#   PSTAI_4 <chr>, PSTAI_5 <chr>, PSTAI_6 <chr>, PE <dbl>, PVRA <dbl>,
#   ASSQ_1 <dbl>, ASSQ_2 <dbl>, ASSQ_3 <dbl>, ASSQ_4 <dbl>, ASSQ_5 <dbl>, …
Code
library(dplyr)

data = all_data %>% select(age, BSSQ_5, ASSQ_5)
head(data)
# A tibble: 6 × 3
    age BSSQ_5 ASSQ_5
  <dbl>  <dbl>  <dbl>
1    30      0      2
2    37      4      3
3    28      2      1
4    39      0      0
5    37      0      0
6    18      4      0
Code
mutated_data = mutate(data, focus_lvl_diff = ASSQ_5 - BSSQ_5)
head(mutated_data)
# A tibble: 6 × 4
    age BSSQ_5 ASSQ_5 focus_lvl_diff
  <dbl>  <dbl>  <dbl>          <dbl>
1    30      0      2              2
2    37      4      3             -1
3    28      2      1             -1
4    39      0      0              0
5    37      0      0              0
6    18      4      0             -4

very important No

  • bullet
  • bullet
Code
library(tidyverse)
# ggplot(mutated_data, aes(x=age, y=focus_lvl_diff, colour = gender)) +
  #geom_point() + 
  #geom_smooth(method = "lm", se = FALSE) + 
  #labs(x="Age",
   #    y="Difference in Focus Level",
    #   title="Age vs ∆ Focus") 
Code
library(ggplot2)


# Fitting the model
model = lm(focus_lvl_diff ~ age, data = mutated_data)

# Create a residual plot
ggplot(model, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "red") +
  labs(title = "Residuals vs. Fitted Values")

Code
ggplot(mutated_data, aes(x=age)) + geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Code
ggplot(mutated_data, aes(x=focus_lvl_diff)) + geom_bar()

just say that lm isn’t the perfect fit for the data. Dive deeper to the data, look into journals abt the correlation you’re researching.

how the effect of using vr